In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
In [2]:
import sqlite3

import numpy as np
import pandas as pd
import seaborn as sns
# matplotlibのグラフをRetinaの高解像度で表示する
%config InlineBackend.figure_formats = {'png', 'retina'}
# Jupyter Notebookの中で作図した画像を表示させる
%matplotlib inline
# matplotlib をインポートする
import matplotlib.pyplot as plt
# 図のサイズを12inch x 12inch = 864px X 864px にする
plt.rcParams['figure.figsize'] = 16, 9
# 日本語タイトルのため、japanizeをインポートする
import japanize_matplotlib
plt.rcParams['font.family'] = 'IPAexGothic'
In [3]:
pwd
Out[3]:
'C:\\Users\\tkjyoke\\pythonProject\\sowi_com'
In [4]:
cd r361
[WinError 2] 指定されたファイルが見つかりません。: 'r361'
C:\Users\tkjyoke\pythonProject\sowi_com
In [5]:
cd D:
D:\
In [6]:
cd jupyter
D:\jupyter
In [7]:
ls
 ドライブ D のボリューム ラベルは ボリューム です
 ボリューム シリアル番号は C64A-827C です

 D:\jupyter のディレクトリ

2023/04/12  16:42    <DIR>          .
2023/04/12  16:42    <DIR>          ..
2022/12/27  16:26    <DIR>          .ipynb_checkpoints
2022/10/07  13:12            60,342 【Master_to_USE】https_log_read_04_28_glob_used.ipynb
2022/12/01  14:12            33,357 【ローデータ1125】1201.xlsx
2022/12/12  16:22            44,539 【ローデータ1206】1212.xlsx
2022/12/12  16:23           244,750 【視聴データ1206】1212.pdf
2023/01/06  13:01            28,400 03-03-merging.ipynb
2022/12/01  14:36           107,260 11_25視聴分析.pptx
2022/11/10  12:05            66,631 1101_view_statics.pptx
2022/11/10  11:27            62,678 1101ローデータ(1104).xlsx
2022/12/13  08:59           186,467 12_06_11_25視聴分析.pptx
2022/04/14  16:42            37,549 132_jirei_addr_non_dup_04_14_r4.ipynb
2022/04/14  15:03           220,752 2E546500
2022/03/08  12:52           370,631 46729840
2022/05/06  16:40            57,584 8FC75E20
2022/02/15  13:33           177,571 ad_pl_qa_02_14_todays_final.ipynb
2022/02/15  15:49           300,404 ad_pl_qa_02_15_final.ipynb
2022/02/21  10:23           594,508 ad_pl_qa_02_16_fina[.ipynb
2022/02/24  11:41           695,928 ad_pl_qa_02_22_final_v4-Copy1.ipynb
2022/02/24  16:44           225,352 ad_pl_qa_02_22_final_v5-Copy1.ipynb
2022/02/28  15:45           106,290 ad_pl_qa_02_28_final_v10_for_list.ipynb
2022/02/28  13:02         2,057,652 ad_pl_qa_02_28_noon_v8.ipynb
2022/03/02  15:23         1,836,188 ad_pl_qa_03_02_noon_v14.ipynb
2022/03/02  12:00           291,971 ad_pl_qa_03_02_noon_v14-Copy1.ipynb
2022/03/03  10:53         2,298,282 ad_pl_qa_03_03_w_ct_v2_bk.ipynb
2022/03/08  08:38           370,738 AD_PL_QA_03_08_203ans_final.xlsx
2022/03/16  15:07           226,818 ad_pl_qa_03_16_cross_table_v4_203ans-backup.ipynb
2022/03/18  12:07         3,014,582 ad_pl_qa_03_17_203ans_multiple_answer_ct_new_elder_rate_v1.ipynb
2022/04/05  15:11         3,322,798 ad_pl_qa_04_05_r4_cross_table_v6_196ans_wo_dup_new_elder_rate.ipynb
2022/03/01  10:39             5,344 adpl_ind.xlsx
2022/04/20  13:35           472,442 ADPL_q37_372_KHCoder.pptx
2022/04/05  15:10           166,968 adpl_qa_ans_row_196_by_51_apr_05.xlsx
2022/03/16  14:30         2,176,895 adpl_qa_final_2.sav
2022/03/08  11:21         1,040,097 adpl_qa_summary_1.pptx
2022/03/10  16:31         2,635,343 adpl_qa_summary_cross_table.pptx
2022/03/10  16:31         2,635,342 adpl_qa_summary_cross_table_v1.pptx
2022/03/16  16:34         2,597,233 adpl_qa_summary_cross_table_v2_03_16.pptx
2022/03/17  10:19         2,555,744 adpl_qa_summary_cross_table_v3_03_17.pptx
2022/03/18  11:09         2,863,009 adpl_qa_summary_cross_table_v3_03_18.pptx
2022/03/01  10:39             5,278 adpl_year.xlsx
2021/01/06  09:53        34,391,374 AI_pdf.pdf
2022/10/12  09:27            30,904 AttendeeReport (1).xlsx
2022/10/17  11:48            27,088 AttendeeReport (2).xlsx
2022/02/04  16:31           790,521 california-housing-prices-regression-with-xgboost.ipynb
2022/02/04  16:36           790,521 california-housing-prices-regression-with-xgboost_02_04_r4.ipynb
2022/02/14  13:54             8,200 cols.xlsx
2022/12/16  10:36            20,976 create_sql_lookup_for_contest_winner2022_2015.ipynb
2022/03/31  14:33             4,885 cross_table_chi2_test.ipynb
2023/03/22  16:45            23,005 cross_table_visualization.xlsx
2023/03/22  14:56             2,162 ct4.csv
2022/05/06  13:37            50,110 D150BD20.xlsx
2022/04/14  15:02           220,769 DB636500
2022/02/14  13:25            60,423 df.xlsx
2022/02/22  11:44           147,344 df_all_q29_OHE.xlsx
2022/03/01  10:39            17,119 df_q11_OHE.xlsx
2022/03/01  10:39           348,793 dfall_ans.xlsx
2023/04/06  09:08         4,494,149 Downloads (6).zip
2022/03/25  14:44             7,593 dummy_df.xlsx
2022/03/25  15:09            10,742 dummy2_df.xlsx
2023/04/07  09:33            29,263 E566E640
2022/05/06  15:36            57,592 elder.xlsx
2022/05/06  16:24            19,652 elder_企業名_cleaning_to_input_sqllite.ipynb
2022/05/06  16:29            44,121 elder_企業名_cleaning_to_input_sqllite-Copy1.ipynb
2022/05/02  16:21            50,151 elder_掲載企業法人名_0502_v2.xlsx
2022/05/06  14:14            59,676 elder_掲載企業法人名_0506_v3.xlsx
2023/04/12  16:40           115,430 employment.png
2023/04/07  10:16           739,358 employment.svg
2023/04/12  16:39           116,832 employment_df1.png
2023/04/12  16:39           115,430 employment_df2.png
2023/04/10  14:46            28,639 employment_mean_min_max.svg
2023/04/12  14:55           400,521 employment1.png
2023/04/07  10:37           740,255 employment1.svg
2023/04/12  13:47           397,750 employment2.png
2023/04/07  10:37           739,358 employment2.svg
2023/04/12  14:56           384,673 employment3.png
2022/11/10  11:53             9,272 Excel.xlsx
2022/11/10  11:55             9,662 Excel2.xlsx
2022/03/08  08:50         1,101,727 final_アドバイザープランナーアンケートR4_03_08_r4.pdf
2022/03/25  16:33            13,443 google_form1.PNG
2022/03/25  16:04            12,284 google_forms.PNG
2019/09/22  01:36         1,423,529 housing.csv
2022/02/03  11:05           409,382 housing.csv.zip
2022/02/02  14:32           366,590 housing_price_xgboost_01_26_v1.ipynb
2022/02/02  14:33           366,590 housing_price_xgboost_02_02_v1.ipynb
2022/01/31  12:59         1,989,479 housing0131_v4.csv
2022/02/02  14:29         1,969,954 housing0202_v5.csv
2022/01/26  12:02         2,028,521 housing2.csv
2022/01/26  15:36         1,989,479 housing3.csv
2022/02/03  12:54         1,711,232 housing4.csv
2022/02/02  15:59               329 housing-main.zip
2022/02/07  16:36           249,856 hr_01.db
2022/08/26  15:17    <DIR>          hr_db
2022/05/26  16:20           225,280 HR_Employee_Attrition.db
2022/07/15  16:22            38,607 hr_test.csv
2023/03/23  15:42           130,157 img_hr.png
2022/03/17  16:17            14,789 join_list_revisit_study.ipynb
2021/12/28  14:57             3,122 list_naiho_expression_must_understand.ipynb
2022/11/17  16:43            26,030 mailaddr_132_wo_dup.xlsx
2022/12/26  13:43            36,944 matome_pandas_df_mamupulating.ipynb
2022/03/16  14:32            32,888 mecab_full_explained_01_11_2022.ipynb
2022/03/14  08:52            53,174 mecab_test_12_21-Copy1.ipynb
2022/01/11  14:27           168,374 mecab_test_12_27.ipynb
2023/01/12  16:03    <DIR>          moushikomi_r4
2022/03/25  16:15         3,307,304 multiple_answer_shuukei.ipynb
2022/12/14  14:15            45,688 MUST_Save_mouhikomi_data_all_r4.ipynb
2022/10/04  13:19           459,637 outlook_to_eml_file_conversion_101.pptx
2021/06/11  02:24         1,397,547 property_values_score.sav
2022/02/15  16:48            13,521 q21_OHE.xlsx
2022/03/01  10:39             6,026 q29_ans_list.xlsx
2022/02/22  11:37           147,344 q29_OHE.xlsx
2022/02/24  09:28         1,063,385 r4_02_24_171ans.pdf
2022/03/01  16:39    <DIR>          r4_03_01
2022/03/03  09:03    <DIR>          r4_03_03
2022/03/18  11:43    <DIR>          r4_03_14
2022/03/14  14:10           707,752 r4_03_14.zip
2022/03/18  13:42    <DIR>          r4_03_18
2022/12/16  10:02         1,540,539 read_csv_df_filter_concat_rename_truncate_value_count_df_pie_barplot.ipynb
2022/12/19  15:38           713,521 read_r3_61_245447by137_sns_plot_v1.ipynb
2022/12/27  09:01           163,313 read_r3_61_sql_select_here_document_12_26_R4.ipynb
2022/12/27  14:50           240,882 read_r3_61_突合_contest_winner_243_1227_2022.ipynb
2022/12/27  16:23           218,010 read_shokulab_突合_contest_winner_243_1227_2022.ipynb
2022/12/26  14:33           294,993 read_write_sql_excelz_csv_cols_divide_cols_matching_rename_cols_SQL_SELECT.ipynb
2022/04/05  15:58         7,383,672 ReliefJetEssentialsOutlook.exe
2022/10/06  12:09    <DIR>          renamed_dir
2022/11/29  14:04            36,147 SAVE_Final_teams_AttendeeReport_sumarize_1101_tokyo.ipynb
2022/12/01  14:29           305,869 SAVE_Final_teams_AttendeeReport_sumarize_1125_tokyo.ipynb
2022/12/08  16:02           180,021 SAVE_Final_teams_AttendeeReport_sumarize_1125_tokyo-Copy1.ipynb
2022/12/13  08:55           316,488 SAVE_Final_teams_AttendeeReport_sumarize_1206.ipynb
2022/12/01  11:58            58,001 SAVE_FOR_USE_132_MAIL_CREATION_Dec_01_r4_no_yakushoku.ipynb
2022/04/12  13:31            20,418 SAVE_FOR_USE_EACH_MAIL_CREATION_APR_05_V1.ipynb
2022/11/09  13:55            14,744 SAVE_FOR_USE_EACH_MAIL_CREATION_NOV_08_v3_yakushoku.ipynb
2022/10/07  10:26            15,601 SAVE_FOR_USE_EACH_MAIL_CREATION_OCT_04_v1.ipynb
2022/11/08  13:20            15,711 SAVE_FOR_USE_EACH_MAIL_CREATION_OCT_04_v2_yakushoku.ipynb
2022/10/20  14:13           373,425 SAVE_teams_AttendeeReport_sumarize_1017_v3.ipynb
2022/03/01  10:39             5,126 sogyo_shien.xlsx
2022/03/30  16:56             2,327 spss_cross_basic.sav
2022/03/30  14:40            12,564 spss_cross_basic.xlsx
2022/03/30  16:56               691 spss_cross_basic2.sav
2022/03/30  14:42             8,441 spss_cross_basic2.xlsx
2022/03/08  16:46         1,993,020 spss_mecab_memo_0307_v8_2022.pptx
2022/03/10  13:01         2,155,416 spss_mecab_memo_0310_v9_2022.pptx
2022/03/25  16:44         2,368,118 spss_mecab_memo_0316_v10_2022.pptx
2022/02/07  15:11           120,511 sql_01_20_r4.ipynb
2023/03/08  11:10    <DIR>          sqliteDB
2022/01/24  13:11    <DIR>          T_SHOKUBA_20220119
2022/01/21  16:49        23,797,025 T_SHOKUBA_20220119.zip
2022/03/01  10:39             5,703 tdfk.xlsx
2022/03/03  13:16               524 tdfk_ID.txt
2022/10/14  16:46           136,875 teams_AttendeeReport_manipulate_1012_2022_v1.ipynb
2022/10/17  10:35           347,053 teams_AttendeeReport_manipulate_1017_2022_v1 (3).ipynb
2021/06/11  02:24           121,115 telco.sav
2021/06/11  02:24            62,928 telco_Feb.sav
2021/06/11  02:24            62,320 telco_Jan.sav
2021/11/02  16:12                 0 test.text
2022/05/06  16:02            57,477 test2.xlsx
2022/03/10  11:02            87,361 value_counts_visualization_03_10.ipynb
2022/08/26  15:16           227,974 WA_Fn-UseC_-HR-Employee-Attrition.csv
2022/04/21  16:34           632,372 wordcloud_04_19_2022_v1.pptx
2022/01/21  15:41         2,028,526 xgb.csv
2022/04/14  14:57           220,815 マスターリスト_132_wo_dup_url_w_61_w_ID_04_14_2022_v25.xlsm
2023/03/31  15:21    <DIR>          ローデータ
2022/03/18  11:32         3,456,179 活動支援ツールと啓発に関するニーズ調査_クロス集計_WordCloud_総務省統計反映_196_03_18.pptx
2022/03/18  13:41         3,885,088 活動支援ツールと啓発に関するニーズ調査_クロス集計_WordCloud_総務省統計反映_196_03_18_v1.pptx
2022/03/18  14:27         3,836,388 活動支援ツールと啓発に関するニーズ調査_クロス集計_WordCloud_総務省統計反映_196_03_18_v2.pptx
2022/08/04  11:12            18,774 基準データ.xlsx
2023/04/07  09:22            29,712 基準データ_2023.xlsx
2023/04/07  09:59            31,163 基準データ2.xlsx
2023/04/07  10:03            30,932 基準データ2023.xlsx
2023/04/07  09:35            29,248 基準データ22.xlsx
2022/08/04  11:49           513,441 雇用力基準データ.pptx
2022/08/05  13:43           677,937 雇用力基準データ_08_05_v2.pptx
2023/04/07  13:05         1,857,261 雇用力調査0404_r4_q2q.pptx
2023/04/07  13:43         1,979,448 雇用力調査0407_r4_final.pptx
2023/04/07  16:43         1,957,465 雇用力調査0407_r4_final2.pptx
2023/04/07  13:06         1,857,260 雇用力調査0407_r4_q2q.pptx
2023/04/10  15:41         2,969,310 雇用力調査0410_r4_final3.pptx
2023/03/31  15:21    <DIR>          集計表
             164 個のファイル         170,683,237 バイト
              14 個のディレクトリ  207,676,071,936 バイトの空き領域
In [8]:
#xlsx = pd.ExcelFile('r3_61_245447by135_v2.xlsx')
#df = pd.read_excel(xlsx, '05_23', index_col=None, header=0)
In [ ]:
 
In [9]:
import codecs
with codecs.open("WA_Fn-UseC_-HR-Employee-Attrition.csv", mode ="r", encoding ="utf-8", errors="ignore") as file:
    df = pd.read_csv(file, delimiter =",", header=0)
In [10]:
df
Out[10]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 36 No Travel_Frequently 884 Research & Development 23 2 Medical 1 2061 ... 3 80 1 17 3 3 5 2 0 3
1466 39 No Travel_Rarely 613 Research & Development 6 1 Medical 1 2062 ... 1 80 1 9 5 3 7 7 1 7
1467 27 No Travel_Rarely 155 Research & Development 4 3 Life Sciences 1 2064 ... 2 80 1 6 0 3 6 2 0 3
1468 49 No Travel_Frequently 1023 Sales 2 3 Medical 1 2065 ... 4 80 0 17 3 2 9 6 0 8
1469 34 No Travel_Rarely 628 Research & Development 8 3 Medical 1 2068 ... 1 80 0 6 3 4 4 3 1 2

1470 rows × 35 columns

In [11]:
dfbk = df
In [12]:
df = dfbk
In [13]:
for col in df.columns:
    print(col, len(df[col].unique()), df[col].unique())
Age 43 [41 49 37 33 27 32 59 30 38 36 35 29 31 34 28 22 53 24 21 42 44 46 39 43
 50 26 48 55 45 56 23 51 40 54 58 20 25 19 57 52 47 18 60]
Attrition 2 ['Yes' 'No']
BusinessTravel 3 ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
DailyRate 886 [1102  279 1373 1392  591 1005 1324 1358  216 1299  809  153  670 1346
  103 1389  334 1123 1219  371  673 1218  419  391  699 1282 1125  691
  477  705  924 1459  125  895  813 1273  869  890  852 1141  464 1240
 1357  994  721 1360 1065  408 1211 1229  626 1434 1488 1097 1443  515
  853 1142  655 1115  427  653  989 1435 1223  836 1195 1339  664  318
 1225 1328 1082  548  132  746  776  193  397  945 1214  111  573 1153
 1400  541  432  288  669  530  632 1334  638 1093 1217 1353  120  682
  489  807  827  871  665 1040 1420  240 1280  534 1456  658  142 1127
 1031 1189 1354 1467  922  394 1312  750  441  684  249  841  147  528
  594  470  957  542  802 1355 1150 1329  959 1033 1316  364  438  689
  201 1427  857  933 1181 1395  662 1436  194  967 1496 1169 1145  630
  303 1256  440 1450 1452  465  702 1157  602 1480 1268  713  134  526
 1380  140  629 1356  328 1084  931  692 1069  313  894  556 1344  290
  138  926 1261  472 1002  878  905 1180  121 1136  635 1151  644 1045
  829 1242 1469  896  992 1052 1147 1396  663  119  979  319 1413  944
 1323  532  818  854 1034  771 1401 1431  976 1411 1300  252 1327  832
 1017 1199  504  505  916 1247  685  269 1416  833  307 1311  128  488
  529 1210 1463  675 1385 1403  452  666 1158  228  996  728 1315  322
 1479  797 1070  442  496 1372  920  688 1449 1117  636  506  444  950
  889  555  230 1232  566 1302  812 1476  218 1132 1105  906  849  390
  106 1249  192  553  117  185 1091  723 1220  588 1377 1018 1275  798
  672 1162  508 1482  559  210  928 1001  549 1124  738  570 1130 1192
  343  144 1296 1309  483  810  544 1062 1319  641 1332  756  845  593
 1171  350  921 1144  143 1046  575  156 1283  755  304 1178  329 1362
 1371  202  253  164 1107  759 1305  982  821 1381  480 1473  891 1063
  645 1490  317  422 1485 1368 1448  296 1398 1349  986 1099 1116 1499
  983 1009 1303 1274 1277  587  413 1276  988 1474  163  267  619  302
  443  828  561  426  232 1306 1094  509  775  195  258  471  799  956
  535 1495  446 1245  703  823 1246  622 1287  448  254 1365  538  525
  558  782  362 1236 1112  204 1343  604 1216  646  160  238 1397  306
  991  482 1176  913 1076  727  885  243  806  817 1410 1207 1442  693
  929  562  608  580  970 1179  294  314  316  654  168  381  217  501
  650  141  804  975 1090  346  430  268  167  621  527  883  954  310
  719  725  715  657 1146  182  376  571  384  791 1111 1243 1092 1325
  805  213  118  676 1252  286 1258  932 1041  859  720  946 1184  436
  589  760  887 1318  625  180  586 1012  661  930  342 1230 1271 1278
  607  130  300  583 1418 1269  379  395 1265 1222  341  868 1231  102
  881 1383 1075  374 1086  781  177  500 1425 1454  617 1085  995 1122
  618  546  462 1198 1272  154 1137 1188  188 1333  867  263  938  129
  616  498 1404 1053  289 1376  231  152  882  903 1379  335  722  461
  974 1126  840 1134  248  955  939 1391 1206  287 1441  109 1066  277
  466 1055  265  135  247 1035  266  145 1038 1234 1109 1089  788  124
  660 1186 1464  796  415  769 1003 1366  330 1492 1204  309 1330  469
  697 1262 1050  770  406  203 1308  984  439  793 1451 1182  174  490
  718  433  773  603  874  367  199  481  647 1384  902  819  862 1457
  977  942 1402 1421 1361  917  200  150  179  696  116  363  107 1465
  458 1212 1103  966 1010  326 1098  969 1167  694 1320  536  373  599
  251  131  237 1429  648  735  531  429  968  879  640  412  848  360
 1138  325 1322  299 1030  634  524  256 1060  935  495  282  206  943
  523  507  601  855 1291 1405 1369  999 1202  285  404  736 1498 1200
 1439  499  205  683 1462  949  652  332 1475  337  971 1174  667  560
  172  383 1255  359  401  377  592 1445 1221  866  981  447 1326  748
  990  405  115  790  830 1193 1423  467  271  410 1083  516  224  136
 1029  333 1440  674 1342  898  824  492  598  740  888 1288  104 1108
  479 1351  474  437  884 1370  264 1059  563  457 1313  241 1015  336
 1387  170  208  671  711  737 1470  365  763  567  486  772  301  311
  584  880  392  148  708 1259  786  370  678  146  581  918 1238  585
  741  552  369  717  543  964  792  611  176  897  600 1054  428  181
  211 1079  590  305  953  478 1375  244  511 1294  196  734 1239 1253
 1128 1336  234  766  261 1194  431  572 1422 1297  574  355  207  706
  280  726  414  352 1224  459 1254 1131  835 1172 1266  783  219 1213
 1096 1251 1394  605 1064 1337  937  157  754 1168  155 1444  189  911
 1321 1154  557  642  801  161 1382 1037  105  582  704  345 1120 1378
  468  613 1023  628]
Department 3 ['Sales' 'Research & Development' 'Human Resources']
DistanceFromHome 29 [ 1  8  2  3 24 23 27 16 15 26 19 21  5 11  9  7  6 10  4 25 12 18 29 22
 14 20 28 17 13]
Education 5 [2 1 4 3 5]
EducationField 6 ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
EmployeeCount 1 [1]
EmployeeNumber 1470 [   1    2    4 ... 2064 2065 2068]
EnvironmentSatisfaction 4 [2 3 4 1]
Gender 2 ['Female' 'Male']
HourlyRate 71 [ 94  61  92  56  40  79  81  67  44  84  49  31  93  50  51  80  96  78
  45  82  53  83  58  72  48  42  41  86  97  75  33  37  73  98  36  47
  71  30  43  99  59  95  57  76  87  66  55  32  52  70  62  64  63  60
 100  46  39  77  35  91  54  34  90  65  88  85  89  68  69  74  38]
JobInvolvement 4 [3 2 4 1]
JobLevel 5 [2 1 3 4 5]
JobRole 9 ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
JobSatisfaction 4 [4 2 3 1]
MaritalStatus 3 ['Single' 'Married' 'Divorced']
MonthlyIncome 1349 [5993 5130 2090 ... 9991 5390 4404]
MonthlyRate 1427 [19479 24907  2396 ...  5174 13243 10228]
NumCompaniesWorked 10 [8 1 6 9 0 4 5 2 7 3]
Over18 1 ['Y']
OverTime 2 ['Yes' 'No']
PercentSalaryHike 15 [11 23 15 12 13 20 22 21 17 14 16 18 19 24 25]
PerformanceRating 2 [3 4]
RelationshipSatisfaction 4 [1 4 2 3]
StandardHours 1 [80]
StockOptionLevel 4 [0 1 3 2]
TotalWorkingYears 40 [ 8 10  7  6 12  1 17  5  3 31 13  0 26 24 22  9 19  2 23 14 15  4 29 28
 21 25 20 11 16 37 38 30 40 18 36 34 32 33 35 27]
TrainingTimesLastYear 7 [0 3 2 5 1 4 6]
WorkLifeBalance 4 [1 3 2 4]
YearsAtCompany 37 [ 6 10  0  8  2  7  1  9  5  4 25  3 12 14 22 15 27 21 17 11 13 37 16 20
 40 24 33 19 36 18 29 31 32 34 26 30 23]
YearsInCurrentRole 19 [ 4  7  0  2  5  9  8  3  6 13  1 15 14 16 11 10 12 18 17]
YearsSinceLastPromotion 16 [ 0  1  3  2  7  4  8  6  5 15  9 13 12 10 11 14]
YearsWithCurrManager 18 [ 5  7  0  2  6  8  3 11 17  1  4 12  9 10 15 13 16 14]
In [14]:
## Check for unique values of categorical variables
df_overview = pd.DataFrame([[i, len(df[i].unique()), df[i].dtypes, df[i].isnull().sum()] for i in df.columns], 
                          columns=['Feature', 'Unique Values', 'dtypes', 'NaN']).set_index('Feature')
print(df.shape)
df_overview
(1470, 35)
Out[14]:
Unique Values dtypes NaN
Feature
Age 43 int64 0
Attrition 2 object 0
BusinessTravel 3 object 0
DailyRate 886 int64 0
Department 3 object 0
DistanceFromHome 29 int64 0
Education 5 int64 0
EducationField 6 object 0
EmployeeCount 1 int64 0
EmployeeNumber 1470 int64 0
EnvironmentSatisfaction 4 int64 0
Gender 2 object 0
HourlyRate 71 int64 0
JobInvolvement 4 int64 0
JobLevel 5 int64 0
JobRole 9 object 0
JobSatisfaction 4 int64 0
MaritalStatus 3 object 0
MonthlyIncome 1349 int64 0
MonthlyRate 1427 int64 0
NumCompaniesWorked 10 int64 0
Over18 1 object 0
OverTime 2 object 0
PercentSalaryHike 15 int64 0
PerformanceRating 2 int64 0
RelationshipSatisfaction 4 int64 0
StandardHours 1 int64 0
StockOptionLevel 4 int64 0
TotalWorkingYears 40 int64 0
TrainingTimesLastYear 7 int64 0
WorkLifeBalance 4 int64 0
YearsAtCompany 37 int64 0
YearsInCurrentRole 19 int64 0
YearsSinceLastPromotion 16 int64 0
YearsWithCurrManager 18 int64 0
In [15]:
df = df.iloc[:, [0,1,2,4,5,6,7,11,8,9]]
In [16]:
## Check for unique values of categorical variables
df_overview = pd.DataFrame([[i, len(df[i].unique()), df[i].dtypes, df[i].isnull().sum()] for i in df.columns], 
                          columns=['Feature', 'Unique Values', 'dtypes', 'NaN']).set_index('Feature')
print(df.shape)
df_overview
(1470, 10)
Out[16]:
Unique Values dtypes NaN
Feature
Age 43 int64 0
Attrition 2 object 0
BusinessTravel 3 object 0
Department 3 object 0
DistanceFromHome 29 int64 0
Education 5 int64 0
EducationField 6 object 0
Gender 2 object 0
EmployeeCount 1 int64 0
EmployeeNumber 1470 int64 0
In [41]:
for col in df.columns:
    print(col, len(df[col].unique()), df[col].unique())
Age 43 [41 49 37 33 27 32 59 30 38 36 35 29 31 34 28 22 53 24 21 42 44 46 39 43
 50 26 48 55 45 56 23 51 40 54 58 20 25 19 57 52 47 18 60]
Attrition 2 ['Yes' 'No']
BusinessTravel 3 ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Department 3 ['Sales' 'Research & Development' 'Human Resources']
DistanceFromHome 29 [ 1  8  2  3 24 23 27 16 15 26 19 21  5 11  9  7  6 10  4 25 12 18 29 22
 14 20 28 17 13]
Education 5 [2 1 4 3 5]
EducationField 6 ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
Gender 2 ['Female' 'Male']
EmployeeCount 1 [1]
EmployeeNumber 1470 [   1    2    4 ... 2064 2065 2068]
In [42]:
df.head()
Out[42]:
Age Attrition BusinessTravel Department DistanceFromHome Education EducationField Gender EmployeeCount EmployeeNumber
0 41 Yes Travel_Rarely Sales 1 2 Life Sciences Female 1 1
1 49 No Travel_Frequently Research & Development 8 1 Life Sciences Male 1 2
2 37 Yes Travel_Rarely Research & Development 2 2 Other Male 1 4
3 33 No Travel_Frequently Research & Development 3 4 Life Sciences Female 1 5
4 27 No Travel_Rarely Research & Development 2 1 Medical Male 1 7
In [43]:
df['EducationField'].value_counts()
Out[43]:
Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: EducationField, dtype: int64
In [44]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1470 non-null   int64 
 1   Attrition         1470 non-null   object
 2   BusinessTravel    1470 non-null   object
 3   Department        1470 non-null   object
 4   DistanceFromHome  1470 non-null   int64 
 5   Education         1470 non-null   int64 
 6   EducationField    1470 non-null   object
 7   Gender            1470 non-null   object
 8   EmployeeCount     1470 non-null   int64 
 9   EmployeeNumber    1470 non-null   int64 
dtypes: int64(5), object(5)
memory usage: 115.0+ KB
In [45]:
df.describe()
Out[45]:
Age DistanceFromHome Education EmployeeCount EmployeeNumber
count 1470.000000 1470.000000 1470.000000 1470.0 1470.000000
mean 36.923810 9.192517 2.912925 1.0 1024.865306
std 9.135373 8.106864 1.024165 0.0 602.024335
min 18.000000 1.000000 1.000000 1.0 1.000000
25% 30.000000 2.000000 2.000000 1.0 491.250000
50% 36.000000 7.000000 3.000000 1.0 1020.500000
75% 43.000000 14.000000 4.000000 1.0 1555.750000
max 60.000000 29.000000 5.000000 1.0 2068.000000
In [ ]:
 
In [ ]:
 
In [15]:
plt.rcParams['figure.figsize'] = 10, 6
In [16]:
col_name = 'YearsAtCompany'
sns.countplot(x=col_name, data=df, palette='hls')
#seaborn countplotを設定し、変数axとしてAnnotationを上書できるようにする
ax = sns.countplot(x = col_name, 
                   data = df)
#Annotation を設定する
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., 
                    p.get_height()), 
                    ha = 'center', 
                    va = 'center', 
                    xytext = (0, 9), 
                    textcoords = 'offset points',
                    fontsize = 8,
                    color = 'blue')
In [17]:
df[col_name].describe()
Out[17]:
count    1470.000000
mean        7.008163
std         6.126525
min         0.000000
25%         3.000000
50%         5.000000
75%         9.000000
max        40.000000
Name: YearsAtCompany, dtype: float64
In [18]:
col_name = 'TotalWorkingYears'
sns.countplot(x=col_name, data=df, palette='hls')
#seaborn countplotを設定し、変数axとしてAnnotationを上書できるようにする
ax = sns.countplot(x = col_name, 
                   data = df)
#Annotation を設定する
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., 
                    p.get_height()), 
                    ha = 'center', 
                    va = 'center', 
                    xytext = (0, 9), 
                    textcoords = 'offset points',
                    fontsize = 8,
                    color = 'blue')
In [19]:
df[col_name].describe()
Out[19]:
count    1470.000000
mean       11.279592
std         7.780782
min         0.000000
25%         6.000000
50%        10.000000
75%        15.000000
max        40.000000
Name: TotalWorkingYears, dtype: float64
In [20]:
col_name = 'Age'
sns.countplot(x=col_name, data=df, palette='hls')
#seaborn countplotを設定し、変数axとしてAnnotationを上書できるようにする
ax = sns.countplot(x = col_name, 
                   data = df)
#Annotation を設定する
for p in ax.patches:
    ax.annotate(format(p.get_height(), '.0f'), 
                   (p.get_x() + p.get_width() / 2., 
                    p.get_height()), 
                    ha = 'center', 
                    va = 'center', 
                    xytext = (0, 9), 
                    textcoords = 'offset points',
                    fontsize = 8,
                    color = 'blue')
In [21]:
df[col_name].describe()
Out[21]:
count    1470.000000
mean       36.923810
std         9.135373
min        18.000000
25%        30.000000
50%        36.000000
75%        43.000000
max        60.000000
Name: Age, dtype: float64
In [ ]:
 
In [22]:
x_name = 'Age'
y_name = 'MonthlyRate'
sns.lineplot(data=df, x=x_name, y=y_name)
Out[22]:
<AxesSubplot: xlabel='Age', ylabel='MonthlyRate'>
In [23]:
# 1,000ドル単位に下3桁で切り捨てて、それを列名'MonthlyRateに格納します
df['MonthlyRate'] = df['MonthlyIncome'].round(-3)
# MonthlyRateの度数分布を作成します
sns.countplot(x='MonthlyRate', data=df, palette='hls')
Out[23]:
<AxesSubplot: xlabel='MonthlyRate', ylabel='count'>
In [24]:
ct = pd.crosstab(df['Age'], df['MonthlyRate'])
ct
Out[24]:
MonthlyRate 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000
Age
18 3 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
19 2 4 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
20 1 2 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
21 2 3 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
22 0 6 7 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23 1 7 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
24 0 8 7 8 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
25 1 4 3 9 5 3 0 0 1 0 0 0 0 0 0 0 0 0 0 0
26 0 11 11 8 6 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
27 0 13 11 9 4 5 2 0 2 1 0 0 1 0 0 0 0 0 0 0
28 1 12 14 5 8 1 3 2 1 1 0 0 0 0 0 0 0 0 0 0
29 1 16 15 10 6 5 4 5 2 2 0 1 0 0 0 1 0 0 0 0
30 2 11 11 7 10 4 6 2 2 3 1 1 0 0 0 0 0 0 0 0
31 3 6 13 10 8 7 8 3 1 4 3 1 0 1 0 1 0 0 0 0
32 1 7 12 9 13 6 4 0 2 5 2 0 0 0 0 0 0 0 0 0
33 0 8 14 6 11 5 3 3 1 1 0 2 1 1 0 1 1 0 0 0
34 0 8 15 11 16 9 5 3 2 5 0 2 0 0 0 0 1 0 0 0
35 1 12 9 12 14 6 4 4 6 7 2 1 0 0 0 0 0 0 0 0
36 0 5 13 5 13 10 7 6 3 6 0 1 0 0 0 0 0 0 0 0
37 0 4 7 13 3 8 4 1 2 3 1 1 0 3 0 0 0 0 0 0
38 0 6 11 6 8 8 5 3 3 3 1 2 1 1 0 0 0 0 0 0
39 0 3 1 6 8 5 1 3 0 2 3 2 3 0 0 0 2 0 3 0
40 0 8 5 8 5 6 4 2 1 4 2 0 3 2 0 1 1 1 2 2
41 0 5 5 2 4 5 3 2 3 1 0 0 0 2 0 1 2 0 3 2
42 0 2 7 6 9 4 4 0 0 1 0 0 2 2 0 1 1 4 2 1
43 0 5 1 2 6 3 1 2 0 2 2 0 0 1 0 1 3 1 1 1
44 0 6 7 1 4 4 0 2 0 3 0 0 1 0 0 1 0 1 2 1
45 0 5 2 6 6 5 2 0 2 2 3 0 2 0 0 0 2 2 2 0
46 0 0 1 0 4 2 2 1 4 4 3 0 1 0 0 0 4 2 4 1
47 0 1 3 1 7 1 0 0 1 1 0 2 1 1 0 1 2 1 0 1
48 0 2 2 2 2 1 0 2 0 0 1 0 1 0 2 0 2 1 0 1
49 0 0 2 2 4 0 4 1 0 1 1 0 1 2 1 1 0 0 3 1
50 0 1 2 2 1 1 2 0 1 1 2 0 1 2 0 0 3 5 4 2
51 0 2 2 0 3 1 1 0 0 0 2 1 2 2 0 1 0 0 0 2
52 0 0 4 1 1 1 0 2 0 1 0 0 1 1 0 0 3 0 1 2
53 0 2 0 1 0 0 1 2 0 2 1 1 1 1 3 0 1 1 2 0
54 0 0 1 2 2 1 1 0 0 0 4 0 0 1 0 1 3 1 1 0
55 0 0 1 2 2 1 0 0 0 1 1 0 0 2 2 0 3 0 4 3
56 0 2 2 1 2 1 1 0 0 0 1 0 2 0 0 0 0 0 0 2
57 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0
58 0 3 1 0 0 1 0 0 0 2 1 0 0 1 0 2 0 1 1 1
59 0 2 1 0 2 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0
60 0 0 0 0 2 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1
In [25]:
# figsizeを変更するモジュール rcParsmsを読み込みます
from pylab import rcParams
# figsize を24 x 13インチにします
rcParams['figure.figsize'] = 24,13
# クロス集計表のデータフレーム ct のヒートマップを作成します。
# 色だけでなく、度数も表示するため、annot = True を指定します
sns.heatmap(ct, annot=True, cmap='BuGn')
# カレントディレクトリにヒートマップ図をimg_hr.pngとして保存します
plt.savefig("img_hr.png")
In [26]:
pwd
Out[26]:
'D:\\jupyter'
In [27]:
import matplotlib.pyplot as plt
import numpy as np

x = np.random.rand(1000)
y = np.random.rand(1000)
z = np.random.rand(1000)

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(x, y, z, marker=".", color='red')

plt.show()
In [28]:
ct
Out[28]:
MonthlyRate 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000
Age
18 3 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
19 2 4 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
20 1 2 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
21 2 3 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
22 0 6 7 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23 1 7 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
24 0 8 7 8 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
25 1 4 3 9 5 3 0 0 1 0 0 0 0 0 0 0 0 0 0 0
26 0 11 11 8 6 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
27 0 13 11 9 4 5 2 0 2 1 0 0 1 0 0 0 0 0 0 0
28 1 12 14 5 8 1 3 2 1 1 0 0 0 0 0 0 0 0 0 0
29 1 16 15 10 6 5 4 5 2 2 0 1 0 0 0 1 0 0 0 0
30 2 11 11 7 10 4 6 2 2 3 1 1 0 0 0 0 0 0 0 0
31 3 6 13 10 8 7 8 3 1 4 3 1 0 1 0 1 0 0 0 0
32 1 7 12 9 13 6 4 0 2 5 2 0 0 0 0 0 0 0 0 0
33 0 8 14 6 11 5 3 3 1 1 0 2 1 1 0 1 1 0 0 0
34 0 8 15 11 16 9 5 3 2 5 0 2 0 0 0 0 1 0 0 0
35 1 12 9 12 14 6 4 4 6 7 2 1 0 0 0 0 0 0 0 0
36 0 5 13 5 13 10 7 6 3 6 0 1 0 0 0 0 0 0 0 0
37 0 4 7 13 3 8 4 1 2 3 1 1 0 3 0 0 0 0 0 0
38 0 6 11 6 8 8 5 3 3 3 1 2 1 1 0 0 0 0 0 0
39 0 3 1 6 8 5 1 3 0 2 3 2 3 0 0 0 2 0 3 0
40 0 8 5 8 5 6 4 2 1 4 2 0 3 2 0 1 1 1 2 2
41 0 5 5 2 4 5 3 2 3 1 0 0 0 2 0 1 2 0 3 2
42 0 2 7 6 9 4 4 0 0 1 0 0 2 2 0 1 1 4 2 1
43 0 5 1 2 6 3 1 2 0 2 2 0 0 1 0 1 3 1 1 1
44 0 6 7 1 4 4 0 2 0 3 0 0 1 0 0 1 0 1 2 1
45 0 5 2 6 6 5 2 0 2 2 3 0 2 0 0 0 2 2 2 0
46 0 0 1 0 4 2 2 1 4 4 3 0 1 0 0 0 4 2 4 1
47 0 1 3 1 7 1 0 0 1 1 0 2 1 1 0 1 2 1 0 1
48 0 2 2 2 2 1 0 2 0 0 1 0 1 0 2 0 2 1 0 1
49 0 0 2 2 4 0 4 1 0 1 1 0 1 2 1 1 0 0 3 1
50 0 1 2 2 1 1 2 0 1 1 2 0 1 2 0 0 3 5 4 2
51 0 2 2 0 3 1 1 0 0 0 2 1 2 2 0 1 0 0 0 2
52 0 0 4 1 1 1 0 2 0 1 0 0 1 1 0 0 3 0 1 2
53 0 2 0 1 0 0 1 2 0 2 1 1 1 1 3 0 1 1 2 0
54 0 0 1 2 2 1 1 0 0 0 4 0 0 1 0 1 3 1 1 0
55 0 0 1 2 2 1 0 0 0 1 1 0 0 2 2 0 3 0 4 3
56 0 2 2 1 2 1 1 0 0 0 1 0 2 0 0 0 0 0 0 2
57 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0
58 0 3 1 0 0 1 0 0 0 2 1 0 0 1 0 2 0 1 1 1
59 0 2 1 0 2 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0
60 0 0 0 0 2 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1
In [29]:
ct.reset_index()
Out[29]:
MonthlyRate Age 1000 2000 3000 4000 5000 6000 7000 8000 9000 ... 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000
0 18 3 5 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 19 2 4 3 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 20 1 2 8 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 21 2 3 8 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 22 0 6 7 2 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 23 1 7 4 2 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 24 0 8 7 8 3 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 25 1 4 3 9 5 3 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
8 26 0 11 11 8 6 3 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9 27 0 13 11 9 4 5 2 0 2 ... 0 0 1 0 0 0 0 0 0 0
10 28 1 12 14 5 8 1 3 2 1 ... 0 0 0 0 0 0 0 0 0 0
11 29 1 16 15 10 6 5 4 5 2 ... 0 1 0 0 0 1 0 0 0 0
12 30 2 11 11 7 10 4 6 2 2 ... 1 1 0 0 0 0 0 0 0 0
13 31 3 6 13 10 8 7 8 3 1 ... 3 1 0 1 0 1 0 0 0 0
14 32 1 7 12 9 13 6 4 0 2 ... 2 0 0 0 0 0 0 0 0 0
15 33 0 8 14 6 11 5 3 3 1 ... 0 2 1 1 0 1 1 0 0 0
16 34 0 8 15 11 16 9 5 3 2 ... 0 2 0 0 0 0 1 0 0 0
17 35 1 12 9 12 14 6 4 4 6 ... 2 1 0 0 0 0 0 0 0 0
18 36 0 5 13 5 13 10 7 6 3 ... 0 1 0 0 0 0 0 0 0 0
19 37 0 4 7 13 3 8 4 1 2 ... 1 1 0 3 0 0 0 0 0 0
20 38 0 6 11 6 8 8 5 3 3 ... 1 2 1 1 0 0 0 0 0 0
21 39 0 3 1 6 8 5 1 3 0 ... 3 2 3 0 0 0 2 0 3 0
22 40 0 8 5 8 5 6 4 2 1 ... 2 0 3 2 0 1 1 1 2 2
23 41 0 5 5 2 4 5 3 2 3 ... 0 0 0 2 0 1 2 0 3 2
24 42 0 2 7 6 9 4 4 0 0 ... 0 0 2 2 0 1 1 4 2 1
25 43 0 5 1 2 6 3 1 2 0 ... 2 0 0 1 0 1 3 1 1 1
26 44 0 6 7 1 4 4 0 2 0 ... 0 0 1 0 0 1 0 1 2 1
27 45 0 5 2 6 6 5 2 0 2 ... 3 0 2 0 0 0 2 2 2 0
28 46 0 0 1 0 4 2 2 1 4 ... 3 0 1 0 0 0 4 2 4 1
29 47 0 1 3 1 7 1 0 0 1 ... 0 2 1 1 0 1 2 1 0 1
30 48 0 2 2 2 2 1 0 2 0 ... 1 0 1 0 2 0 2 1 0 1
31 49 0 0 2 2 4 0 4 1 0 ... 1 0 1 2 1 1 0 0 3 1
32 50 0 1 2 2 1 1 2 0 1 ... 2 0 1 2 0 0 3 5 4 2
33 51 0 2 2 0 3 1 1 0 0 ... 2 1 2 2 0 1 0 0 0 2
34 52 0 0 4 1 1 1 0 2 0 ... 0 0 1 1 0 0 3 0 1 2
35 53 0 2 0 1 0 0 1 2 0 ... 1 1 1 1 3 0 1 1 2 0
36 54 0 0 1 2 2 1 1 0 0 ... 4 0 0 1 0 1 3 1 1 0
37 55 0 0 1 2 2 1 0 0 0 ... 1 0 0 2 2 0 3 0 4 3
38 56 0 2 2 1 2 1 1 0 0 ... 1 0 2 0 0 0 0 0 0 2
39 57 0 0 0 0 1 0 1 0 1 ... 0 0 0 1 0 0 0 0 0 0
40 58 0 3 1 0 0 1 0 0 0 ... 1 0 0 1 0 2 0 1 1 1
41 59 0 2 1 0 2 0 0 1 0 ... 1 1 0 1 0 0 0 0 1 0
42 60 0 0 0 0 2 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 1

43 rows × 21 columns

In [ ]:
 
In [30]:
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

df = pd.DataFrame({'A':[0,1,2,3,4],
                   'B':[0,1,4,9,16],
                   'C':[0,1,8,27,64]})
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
sc = ax.scatter(df.A, df.B, df.C, s=100)
In [31]:
df
Out[31]:
A B C
0 0 0 0
1 1 1 1
2 2 4 8
3 3 9 27
4 4 16 64
In [32]:
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

df = pd.DataFrame({'A':[0,1,2,3,4],
                   'B':[0,1,4,9,16],
                   'C':[0,1,8,27,64]})

#文字だとプロットできないのでcolumn名を数字に変更
df.rename(columns={'A':1, 'B':2, 'C':3}, inplace=True)

X = np.array([])
Y = np.array([])
Z = np.array([])

for i in range(df.index.size):
    X = np.concatenate([X, np.full(df.columns.size, df.index[i])], 0)

for i in range(df.index.size):
    Y = np.concatenate([Y, np.array(df.columns)], 0)

for i in range(df.index.size):
    Z = np.concatenate([Z, np.array(df[i:i+1])[0]], 0)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
sc = ax.scatter(X, Y, Z)
In [33]:
df
Out[33]:
1 2 3
0 0 0 0
1 1 1 1
2 2 4 8
3 3 9 27
4 4 16 64
In [34]:
ct
Out[34]:
MonthlyRate 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000
Age
18 3 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
19 2 4 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
20 1 2 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
21 2 3 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
22 0 6 7 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
23 1 7 4 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
24 0 8 7 8 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
25 1 4 3 9 5 3 0 0 1 0 0 0 0 0 0 0 0 0 0 0
26 0 11 11 8 6 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0
27 0 13 11 9 4 5 2 0 2 1 0 0 1 0 0 0 0 0 0 0
28 1 12 14 5 8 1 3 2 1 1 0 0 0 0 0 0 0 0 0 0
29 1 16 15 10 6 5 4 5 2 2 0 1 0 0 0 1 0 0 0 0
30 2 11 11 7 10 4 6 2 2 3 1 1 0 0 0 0 0 0 0 0
31 3 6 13 10 8 7 8 3 1 4 3 1 0 1 0 1 0 0 0 0
32 1 7 12 9 13 6 4 0 2 5 2 0 0 0 0 0 0 0 0 0
33 0 8 14 6 11 5 3 3 1 1 0 2 1 1 0 1 1 0 0 0
34 0 8 15 11 16 9 5 3 2 5 0 2 0 0 0 0 1 0 0 0
35 1 12 9 12 14 6 4 4 6 7 2 1 0 0 0 0 0 0 0 0
36 0 5 13 5 13 10 7 6 3 6 0 1 0 0 0 0 0 0 0 0
37 0 4 7 13 3 8 4 1 2 3 1 1 0 3 0 0 0 0 0 0
38 0 6 11 6 8 8 5 3 3 3 1 2 1 1 0 0 0 0 0 0
39 0 3 1 6 8 5 1 3 0 2 3 2 3 0 0 0 2 0 3 0
40 0 8 5 8 5 6 4 2 1 4 2 0 3 2 0 1 1 1 2 2
41 0 5 5 2 4 5 3 2 3 1 0 0 0 2 0 1 2 0 3 2
42 0 2 7 6 9 4 4 0 0 1 0 0 2 2 0 1 1 4 2 1
43 0 5 1 2 6 3 1 2 0 2 2 0 0 1 0 1 3 1 1 1
44 0 6 7 1 4 4 0 2 0 3 0 0 1 0 0 1 0 1 2 1
45 0 5 2 6 6 5 2 0 2 2 3 0 2 0 0 0 2 2 2 0
46 0 0 1 0 4 2 2 1 4 4 3 0 1 0 0 0 4 2 4 1
47 0 1 3 1 7 1 0 0 1 1 0 2 1 1 0 1 2 1 0 1
48 0 2 2 2 2 1 0 2 0 0 1 0 1 0 2 0 2 1 0 1
49 0 0 2 2 4 0 4 1 0 1 1 0 1 2 1 1 0 0 3 1
50 0 1 2 2 1 1 2 0 1 1 2 0 1 2 0 0 3 5 4 2
51 0 2 2 0 3 1 1 0 0 0 2 1 2 2 0 1 0 0 0 2
52 0 0 4 1 1 1 0 2 0 1 0 0 1 1 0 0 3 0 1 2
53 0 2 0 1 0 0 1 2 0 2 1 1 1 1 3 0 1 1 2 0
54 0 0 1 2 2 1 1 0 0 0 4 0 0 1 0 1 3 1 1 0
55 0 0 1 2 2 1 0 0 0 1 1 0 0 2 2 0 3 0 4 3
56 0 2 2 1 2 1 1 0 0 0 1 0 2 0 0 0 0 0 0 2
57 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0
58 0 3 1 0 0 1 0 0 0 2 1 0 0 1 0 2 0 1 1 1
59 0 2 1 0 2 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0
60 0 0 0 0 2 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1
In [35]:
#ct2 = ct.reset_index()
In [36]:
#ct2
In [37]:
#ct4 = ct2.set_index('Age')
In [38]:
#ct4
In [ ]:
 
In [ ]:
 
In [39]:
df = ct
In [40]:
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

#文字だとプロットできないのでcolumn名を数字に変更
#df.rename(columns={'A':1, 'B':2, 'C':3}, inplace=True)

X = np.array([])
Y = np.array([])
Z = np.array([])

for i in range(df.index.size):
    X = np.concatenate([X, np.full(df.columns.size, df.index[i])], 0)

for i in range(df.index.size):
    Y = np.concatenate([Y, np.array(df.columns)], 0)

for i in range(df.index.size):
    Z = np.concatenate([Z, np.array(df[i:i+1])[0]], 0)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
sc = ax.scatter(X, Y, Z)
In [41]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
 
#タイトルで漢字が使えるようフォントを設定
plt.rcParams['font.family'] = 'Meiryo'
 
#描画エリアの作成
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
 
#グラフタイトルを設定
ax.set_title("3D散布図",size=20)
 
#軸ラベルのサイズと色を設定
ax.set_xlabel("x軸",size=15,color="black")
ax.set_ylabel("y軸",size=15,color="black")
ax.set_zlabel("z軸",size=15,color="black")
 
#numpyを使ってXYZの値を設定
x = np.random.rand(200,1) 
y = np.random.rand(200,1)
z = np.random.rand(200,1)
 
#散布図の作成
ax.scatter(x,y,z,s=40,c="red")
 
#描画
plt.show()
In [42]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
 
#タイトルで漢字が使えるようフォントを設定
plt.rcParams['font.family'] = 'Meiryo'
 
#描画エリアの作成
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
 
#グラフタイトルを設定
ax.set_title("3D散布図",size=20)
 
#軸ラベルのサイズと色を設定
ax.set_xlabel("x軸",size=15,color="black")
ax.set_ylabel("y軸",size=15,color="black")
ax.set_zlabel("z軸",size=15,color="black")
 
#x,yデータの作成
data = np.linspace(-3*np.pi,3*np.pi,50)
x, y = np.meshgrid(data,data)
 
# zデータの作成
z = np.cos(x/np.pi)*np.sin(y/np.pi)
 
#散布図の作成
#ワイヤーフレームを描く
ax.plot_wireframe(x,y,z,color="darkblue")
#ax.scatter(x,y,z,s=1,c="red")
 
#描画
plt.show()
In [43]:
import numpy as np

x = np.arange(0, 6, 1)
y = np.arange(0, 4, 1)
X, Y = np.meshgrid(x, y)
In [44]:
print(X)
[[0 1 2 3 4 5]
 [0 1 2 3 4 5]
 [0 1 2 3 4 5]
 [0 1 2 3 4 5]]
In [45]:
import numpy as np

x = np.arange(0, 6, 1)
y = np.arange(0, 6, 1)
Z = np.meshgrid(x, y)
In [46]:
print(Z)
[array([[0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5],
       [0, 1, 2, 3, 4, 5]]), array([[0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1],
       [2, 2, 2, 2, 2, 2],
       [3, 3, 3, 3, 3, 3],
       [4, 4, 4, 4, 4, 4],
       [5, 5, 5, 5, 5, 5]])]
In [ ]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
 
#タイトルで漢字が使えるようフォントを設定
plt.rcParams['font.family'] = 'Meiryo'
 
#描画エリアの作成
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
 
#グラフタイトルを設定
ax.set_title("3D散布図",size=20)
 
#軸ラベルのサイズと色を設定
ax.set_xlabel("x軸",size=15,color="black")
ax.set_ylabel("y軸",size=15,color="black")
ax.set_zlabel("z軸",size=15,color="black")
 
#numpyを使ってXYZの値を設定
#x = np.random.rand(200,1) 
#y = np.random.rand(200,1)
#Z = np.random.rand(200,1)
 
#散布図の作成
ax.plot_wireframe(x,y,Z,s=40,c="red")
 
#描画
plt.show()
In [ ]:
 
In [ ]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from   matplotlib import cm
from   mpl_toolkits.mplot3d import Axes3D
from   scipy.stats import multivariate_normal

N = 60
X = np.linspace(-3, 3, N)
Y = np.linspace(-3, 4, N)
X, Y = np.meshgrid(X, Y)

mu = np.array([0., 1.])
Sigma = np.array([[ 1. , -0.5], [-0.5,  1.5]])

pos = np.empty(X.shape + (2,))
pos[:, :, 0] = X
pos[:, :, 1] = Y

F = multivariate_normal(mu, Sigma)
Z = F.pdf(pos)

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
#ax = fig.gca(projection='3d')
ax.plot_surface(X, Y, Z, rstride=3, cstride=3, linewidth=1, antialiased=True, cmap=cm.viridis)

plt.show()
In [ ]:
 
In [ ]:
 
In [ ]:
print(z)
In [ ]:
ct4
In [ ]:
ct3.to_csv('ct4.csv', encoding='utf-8', index=False)
In [ ]:
ct3 = ct2.reset_index()
In [ ]:
ct3
In [ ]:
import pandas as pd
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np


X = np.array([])
Y = np.array([])
Z = np.array([])

for i in range(df.index.size):
    X = np.concatenate([X, np.full(df.columns.size, df.index[i])], 0)

for i in range(df.index.size):
    Y = np.concatenate([Y, np.array(df.columns)], 0)

for i in range(df.index.size):
    Z = np.concatenate([Z, np.array(df[i:i+1])[0]], 0)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
sc = ax.bar3d(X, Y, Z)
In [ ]:
 
In [ ]:
df['Age']
df['MonthlyRate']
In [ ]:
#input
import matplotlib.pyplot as plt
import numpy as np

#fig = plt.figure()
ax = fig.add_subplot(projection='3d')

x = df['Age']
y = df['MonthlyRate']
xx, yy = np.meshgrid(x, y)

xr = xx.ravel()
yr = yy.ravel()

height = xr + yr + 1

bottom = np.ones_like(height)
width = 0.5
depth = 0.5

#ax.bar3d(xr, yr, bottom, width, depth, height, color='lime')
plt.show()
In [ ]:
#input
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure()
ax = fig.add_subplot(projection='3d')

x = np.arange(3)
y = np.arange(3)
xx, yy = np.meshgrid(x, y)

xr = xx.ravel()
yr = yy.ravel()

height = xr + yr + 1

bottom = np.ones_like(height)
width = 0.5
depth = 0.5

ax.bar3d(xr, yr, bottom, width, depth, height, color='lime')
plt.show()
In [ ]:
print(xr)
In [ ]:
print(xx)
In [ ]:
## Drop veil-type because it offers no information
print("Before", df.shape)
df = df.drop(['EmployeeCount', 'Over18', 'StandardHours'], axis = 1)
print("After", df.shape)
df_overview = df_overview.drop(['EmployeeCount', 'Over18', 'StandardHours'], axis = 0)
df_overview
In [ ]:
## Examine unique values
for i in df.columns:
  print(i, df[i].unique())
In [ ]:
df.shape
In [ ]:
#mkdir sqliteDB 
In [ ]:
cd sqliteDB
In [ ]:
ls
In [ ]:
import sqlite3

#HR_Employee_Attrition.db という名前のDBをなければ作成して接続する
dbname = 'HR_Employee_Attrition.db'

conn = sqlite3.connect(dbname)
cur = conn.cursor()
In [ ]:
#DBファイルが作成されたかを確認する
%ls
In [ ]:
# tableのnameを"HR_Employee_Attrition_tab"とし、読み込んだcsvファイルをsqlに書き込む
# index=False としてindex は書き込まないようにする
# DBには今回のテーブルが一つだけあることになる
# if_existsで書き換える
df.to_sql('HR_Employee_Attrition_tab', conn, if_exists='replace', index=False)
In [ ]:
#DBファイルにデータが書き込まれたことを確認する
%ls
In [ ]:
dbname = "HR_Employee_Attrition.db"
conn = sqlite3.connect(dbname)
cur = conn.cursor()

# dbをpandasで読み出す。
df = pd.read_sql('SELECT * FROM HR_Employee_Attrition_tab', conn)

cur.close()
conn.close()
In [ ]:
df
In [ ]:
conn = sqlite3.connect(dbname)
cur = conn.cursor()

# dbをSQL 文を使ってpandas Dataframeで読み出す。
df = pd.read_sql('SELECT "Department", "Age", "Attrition", "BusinessTravel", "DailyRate" FROM HR_Employee_Attrition_tab WHERE "BusinessTravel"="Travel_Frequently" AND "DailyRate" > 1350' , conn)
In [ ]:
df
In [ ]:
query = """
    SELECT 
          "Department", 
          "Age", 
          "Attrition", 
          "BusinessTravel", 
          "DailyRate"
    FROM  "HR_Employee_Attrition_tab" 
    WHERE "BusinessTravel"="Travel_Frequently" 
    AND   "DailyRate" < 1350
    AND   "Age" > 36
  """
In [ ]:
conn = sqlite3.connect(dbname)
cur = conn.cursor()

# dbをSQL 文を使ってpandas Dataframeで読み出す。
df = pd.read_sql(query, conn)
In [ ]:
df
In [ ]:
dbname = "HR_Employee_Attrition.db"
conn = sqlite3.connect(dbname)
cur = conn.cursor()

# dbをpandasで読み出す。
# WHERE で条件に合致した要素のみを読み込み、それをDataFrame に格納する
df = pd.read_sql('SELECT "Department", "Age", "Attrition" FROM HR_Employee_Attrition_tab WHERE "TotalWorkingYears" > 15 ', conn)
In [ ]:
df
In [ ]:
dbname = "HR_Employee_Attrition.db"
conn = sqlite3.connect(dbname)
cur = conn.cursor()

# dbをpandasで読み出す。
df = pd.read_sql('SELECT * FROM HR_Employee_Attrition_tab', conn)

cur.close()
conn.close()

pd.options.display.max_rows = 220
columns = df.columns
columns = pd.DataFrame(columns)
columns.head(220)
In [ ]:
 
In [ ]:
df
In [ ]:
df["Department"].value_counts()
In [ ]:
dbname = "HR_Employee_Attrition.db"
conn = sqlite3.connect(dbname)
cur = conn.cursor()

# dbをpandasで読み出す。
df = pd.read_sql('SELECT * FROM HR_Employee_Attrition_tab', conn)
In [ ]:
df
In [ ]:
df["EducationField"].value_counts()
In [ ]:
df = df[df["EducationField"]=="Life Sciences"]
In [ ]:
df
In [ ]:
dbname = "HR_Employee_Attrition.db"
conn = sqlite3.connect(dbname)
cur = conn.cursor()

# dbをpandasで読み出す。
df = pd.read_sql('SELECT * FROM HR_Employee_Attrition_tab', conn)
In [ ]:
df
In [ ]:
 
In [ ]:
 
In [ ]: